Bibliotecas utilizadas para o estudo

library(tidyverse)
library(lubridate)
library(corrplot)
library(plotly)
library(readxl)
library(fastDummies)
library(AmesHousing)
library(recipes)
library(caret)
library(urca)
library(car)
library(forecast)
library(rpart)
library(rpart.plot)
library(ranger)
library(randomForest)

Leitura dos dados de teste e treino, unificando em um único dataset e, por fim, transformações primarias.

# Treino #
baba_train <- read.csv("https://raw.githubusercontent.com/diogenesjusto/FIAP/master/SHIFT/202007/Desafio/train_BABA.csv")

baba_train$conj <- "train"

# Teste #

baba_test <- read.csv("https://raw.githubusercontent.com/diogenesjusto/FIAP/master/SHIFT/202007/Desafio/test_BABA.csv")

baba_test$venda <- NA
baba_test$conj <- "test"

# Uniao das bases

baba_all <- rbind(baba_train, baba_test)

baba_all <- baba_all %>% 
  select(-X) %>%
  mutate(
        date = ymd(date),
        month = floor_date(date, "month"),
        weekday = case_when( 
                              weekday == "segunda-feira" ~ 1,
                              weekday == "ter?a-feira"~ 2,
                              weekday == "quarta-feira" ~  3,
                              weekday == "quinta-feira" ~  4,
                              weekday == "sexta-feira" ~  5,
                              weekday == "s?bado" ~  6,
                              weekday == "domingo" ~ 7
                              )
        )

# Base de datas

dim_date <- read_excel("dim_date.xlsx")
dim_date$date_id <- as.Date(as.integer(dim_date$date_id), origin = "1899-12-30")

## join com base de datas

baba_all <- left_join(baba_all, dim_date, by = c("date" = "date_id"))

# Transformação para variaveis nominais

baba_all <- baba_all %>% 
  mutate(weekday = as.character(weekday), mes = as.character(mes), is_brz_holiday = as.character(is_brz_holiday), brz_season = as.character(brz_season))

Analises descritivas do dataset

glimpse(baba_all)
## Rows: 396
## Columns: 12
## $ date           <date> 2014-08-01, 2014-08-02, 2014-08-03, 2014-08-04, 201...
## $ mes            <chr> "agosto", "agosto", "agosto", "agosto", "agosto", "a...
## $ weekday        <chr> "5", "6", "7", "1", "2", "3", "4", "5", "6", "7", "1...
## $ margem         <dbl> 0.4061105, 0.4160218, 0.4319928, 0.4092159, 0.449647...
## $ venda          <dbl> 110042.46, 58377.32, 64635.39, 140417.32, 149700.29,...
## $ desconto       <dbl> 9190.908, 5713.043, 8621.709, 18312.966, 19942.741, ...
## $ outdesc        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ outmg          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ conj           <chr> "train", "train", "train", "train", "train", "train"...
## $ month          <date> 2014-08-01, 2014-08-01, 2014-08-01, 2014-08-01, 201...
## $ is_brz_holiday <chr> "Noholiday", "Noholiday", "Noholiday", "Noholiday", ...
## $ brz_season     <chr> "Winter", "Winter", "Winter", "Winter", "Winter", "W...
summary(baba_all)
##       date                mes              weekday              margem      
##  Min.   :2014-08-01   Length:396         Length:396         Min.   :0.2907  
##  1st Qu.:2014-11-07   Class :character   Class :character   1st Qu.:0.3895  
##  Median :2015-02-14   Mode  :character   Mode  :character   Median :0.4159  
##  Mean   :2015-02-14                                         Mean   :0.4142  
##  3rd Qu.:2015-05-24                                         3rd Qu.:0.4372  
##  Max.   :2015-08-31                                         Max.   :0.5142  
##                                                                             
##      venda           desconto           outdesc             outmg         
##  Min.   : 16509   Min.   :   32.65   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.: 51103   1st Qu.: 3274.98   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median : 75280   Median : 5400.32   Median :0.000000   Median :0.000000  
##  Mean   : 80618   Mean   : 6735.14   Mean   :0.007576   Mean   :0.005051  
##  3rd Qu.:105553   3rd Qu.: 9142.65   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :577017   Max.   :48454.41   Max.   :1.000000   Max.   :1.000000  
##  NA's   :31                                                               
##      conj               month            is_brz_holiday      brz_season       
##  Length:396         Min.   :2014-08-01   Length:396         Length:396        
##  Class :character   1st Qu.:2014-11-01   Class :character   Class :character  
##  Mode  :character   Median :2015-02-01   Mode  :character   Mode  :character  
##                     Mean   :2015-01-30                                        
##                     3rd Qu.:2015-05-01                                        
##                     Max.   :2015-08-01                                        
## 

Analise Exploratória das variáveis

# Vendas por dia #

ggplotly(
  baba_all %>% 
    filter(conj == "train") %>%
    ggplot(aes(x = date, y = venda)) +
    geom_line(color = "black") +
    labs(
      x = "Data",
      y = "Vendas por dia"
    )+
    theme_bw()
)
# Vendas por mês #

ggplotly(
  baba_all %>%
    filter(conj == "train") %>%
    group_by(month) %>% 
    summarise(vendas = sum(venda)) %>% 
    ggplot(aes(x = month, y = vendas)) +
    geom_col(color = "blue", fill = "white") +
    labs(
      x = "Data",
      y = "Vendas por mes"
    )+
    theme_bw()
)
# vendas por estação do ano #

ggplotly(
   baba_all %>%
    filter(conj == "train") %>%
    group_by(brz_season) %>% 
    summarise(vendas = sum(venda)) %>% 
    ggplot(aes(x = brz_season, y = vendas, fill = brz_season)) +
    geom_col(color = "blue") +
    labs(
      x = "Data",
      y = "Vendas por estação"
    )+
    theme_bw()
)
# Proporção vendas feriado #

ggplotly(
   baba_all %>%
    filter(conj == "train") %>%
    group_by(is_brz_holiday) %>% 
    summarise(vendas = sum(venda)/n()) %>% 
    ggplot(aes(x = is_brz_holiday, y = vendas)) +
    geom_col(color = "blue") +
    labs(
      x = "Data",
      y = "Vendas proporção feriado sim/não"
    )+
    theme_bw()
)
# Boxplot Vendas por dia da semana #

ggplotly(baba_all %>% 
  filter(conj == "train") %>% 
  ggplot( aes(x = weekday, y = venda, fill = weekday))+
  geom_boxplot()+
  labs(
      x = "weekday",
      y = "Vendas"
    )+
    theme_bw()
)
# dispersão venda, desconto
plot(baba_train$venda, baba_train$desconto)

# dispersão venda, margem
plot(baba_train$venda, baba_train$margem)

# dispersão desconto, margem
plot(baba_train$desconto, baba_train$margem)

# Histograma de vendas
hist(baba_train$venda)

# Hist de margem
hist(baba_train$margem)

# Hist de desconto
hist(baba_train$desconto)

# Grafico de correlação das variáveis numéricas

baba_all %>%
  filter(conj == "train") %>%
  select_if(is.numeric) %>% 
  cor() %>% 
  corrplot(type = "upper")

Desenvolvimento do modelo de regressão linear, identificando e retirando outliers através do metodo de Distância de Cook

## Modelo de Regressao linear Multipla

baba_train <- baba_all %>%
  filter(conj == "train") %>%
  select(-conj, -date, -outdesc, -outmg, -month)

modelo <- lm(formula = venda ~ .,
                       data = baba_train)

summary(modelo)
## 
## Call:
## lm(formula = venda ~ ., data = baba_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -60314 -10843  -1809   9185 118606 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.031e+05  1.872e+04  10.846  < 2e-16 ***
## mesagosto               -5.487e+04  8.083e+03  -6.788 5.06e-11 ***
## mesdezembro             -3.461e+04  7.811e+03  -4.431 1.27e-05 ***
## mesfevereiro            -1.026e+04  7.485e+03  -1.371 0.171218    
## mesjaneiro              -9.923e+03  7.452e+03  -1.331 0.183917    
## mesjulho                -3.638e+04  7.494e+03  -4.854 1.84e-06 ***
## mesjunho                -1.741e+04  5.129e+03  -3.394 0.000769 ***
## mesmaio                 -7.343e+02  4.667e+03  -0.157 0.875065    
## mesmar?o                -3.263e+03  5.846e+03  -0.558 0.577073    
## mesnovembro             -1.733e+04  8.345e+03  -2.076 0.038612 *  
## mesoutubro              -2.498e+04  8.318e+03  -3.003 0.002868 ** 
## messetembro             -5.197e+04  7.487e+03  -6.940 1.98e-11 ***
## weekday2                -5.996e+03  3.531e+03  -1.698 0.090431 .  
## weekday3                -8.923e+03  3.530e+03  -2.528 0.011924 *  
## weekday4                -1.468e+04  3.544e+03  -4.142 4.35e-05 ***
## weekday5                -1.320e+04  3.526e+03  -3.744 0.000213 ***
## weekday6                -2.475e+04  3.652e+03  -6.776 5.43e-11 ***
## weekday7                -2.212e+04  3.623e+03  -6.105 2.80e-09 ***
## margem                  -3.820e+05  4.085e+04  -9.352  < 2e-16 ***
## desconto                 8.442e+00  2.929e-01  28.824  < 2e-16 ***
## is_brz_holidayNoholiday  8.669e+03  6.611e+03   1.311 0.190608    
## brz_seasonSpring        -5.163e+02  7.031e+03  -0.073 0.941507    
## brz_seasonSummer        -4.374e+03  5.886e+03  -0.743 0.457842    
## brz_seasonWinter         1.792e+04  5.925e+03   3.024 0.002683 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17960 on 341 degrees of freedom
## Multiple R-squared:  0.835,  Adjusted R-squared:  0.8238 
## F-statistic: 75.02 on 23 and 341 DF,  p-value: < 2.2e-16
# Retirada de outliers identificados pelo metodo Distancia de Cook

cooksd <- cooks.distance(modelo)

baba_train$cooktest <- cooksd
baba_train$outlier <- ifelse(2*mean(cooksd) < baba_train$cooktest, 1, 0) #Identificar valores 2 vezes maior que a média da distancia de cook

baba_train <- baba_train %>% 
  filter(outlier != 1) %>% 
  select(-cooktest, -outlier)

## Novo modelo sem outliers

modelo <- lm(formula = venda ~ .,
                       data = baba_train)

summary(modelo)
## 
## Call:
## lm(formula = venda ~ ., data = baba_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27353  -8684  -1958   6992  42235 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.610e+05  1.534e+04  10.494  < 2e-16 ***
## mesagosto               -3.066e+04  6.650e+03  -4.610 5.76e-06 ***
## mesdezembro             -3.457e+04  6.164e+03  -5.609 4.30e-08 ***
## mesfevereiro            -1.120e+04  5.807e+03  -1.928 0.054706 .  
## mesjaneiro              -7.538e+03  5.815e+03  -1.296 0.195792    
## mesjulho                -3.257e+04  5.847e+03  -5.569 5.30e-08 ***
## mesjunho                -1.484e+04  3.967e+03  -3.742 0.000215 ***
## mesmaio                 -2.168e+03  3.625e+03  -0.598 0.550321    
## mesmar?o                -1.579e+03  4.539e+03  -0.348 0.728216    
## mesnovembro             -2.334e+04  6.630e+03  -3.521 0.000490 ***
## mesoutubro              -2.625e+04  6.647e+03  -3.950 9.56e-05 ***
## messetembro             -3.285e+04  5.958e+03  -5.514 7.07e-08 ***
## weekday2                -7.978e+03  2.748e+03  -2.903 0.003950 ** 
## weekday3                -1.111e+04  2.794e+03  -3.976 8.59e-05 ***
## weekday4                -1.586e+04  2.772e+03  -5.722 2.36e-08 ***
## weekday5                -1.907e+04  2.782e+03  -6.855 3.50e-11 ***
## weekday6                -3.073e+04  2.890e+03 -10.634  < 2e-16 ***
## weekday7                -2.735e+04  2.849e+03  -9.603  < 2e-16 ***
## margem                  -2.789e+05  3.419e+04  -8.158 7.18e-15 ***
## desconto                 6.749e+00  3.240e-01  20.828  < 2e-16 ***
## is_brz_holidayNoholiday  1.496e+04  5.407e+03   2.767 0.005984 ** 
## brz_seasonSpring         1.296e+04  5.589e+03   2.319 0.021016 *  
## brz_seasonSummer         4.512e+03  4.588e+03   0.984 0.326068    
## brz_seasonWinter         1.177e+04  4.605e+03   2.557 0.011016 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13740 on 331 degrees of freedom
## Multiple R-squared:  0.8417, Adjusted R-squared:  0.8307 
## F-statistic: 76.51 on 23 and 331 DF,  p-value: < 2.2e-16

Desenvolvimento da tabela com as predições do modelo

# Predict

baba_test <- baba_all %>%
  filter(conj == "test") %>% 
  select(-conj, -venda, -date, -outmg, -outdesc, - month)

pred <- data.frame(x = predict(object = modelo, baba_test,
          interval = "confidence",
          level = 0.95))

Trasnformações finais e geração do arquivo .csv

# transformações finais e arquivo para submissão

pred$id <- 1:31

pred <- pred %>%
  select(id, x.fit)

pred <- rename(pred, venda = "x.fit")

write.table(pred, 'predict_augusto.csv', row.names = FALSE, col.names = FALSE, sep=",")